package com.hrhx.springboot.crawler;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.rocks.BreadthCrawler;
import com.hrhx.springboot.crawler.plugins.net.selenium.ChromeDriverRequest;
import com.hrhx.springboot.crawler.plugins.net.ssl.TrustAllCertificationRequester;
import com.hrhx.springboot.domain.AutohomeBrand;
import com.hrhx.springboot.mysql.repository.AutohomeBrandRepository;
import com.hrhx.springboot.util.SpringUtil;
import org.apache.commons.lang.StringUtils;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * @author duhongming
 * @version 1.0
 * @description Crawls the Autohome per-letter brand index pages and persists one
 *              {@link AutohomeBrand} row per car model listed on them.
 *
 * <p>Note: constructing an instance registers the seed URLs and immediately starts the crawl
 * (see the constructor), so {@code new AutohomeCrawler(...)} only returns after
 * {@code start(1)} has returned.
 * @date 2019-12-08 11:01
 */
public class AutohomeCrawler extends BreadthCrawler {
    /** Part of every index page URL that precedes the letter. */
    private static final String URL_PREFIX = "https://www.autohome.com.cn/grade/carhtml/";
    /** Part of every index page URL that follows the letter. */
    private static final String URL_SUFFIX = ".html";
    /** Placeholder substituted with the index letter when building seed URLs. */
    private static final String LETTER_PLACEHOLDER = "${letter}";
    /** Seed URL template for a brand index page. */
    private static final String API = URL_PREFIX + LETTER_PLACEHOLDER + URL_SUFFIX;

    /** Label preceding the price in a model entry; stripped before the price is stored. */
    private static final String PRICE_LABEL = "指导价：";
    /** Value stored when a model entry carries no labelled price. */
    private static final String PRICE_UNAVAILABLE = "暂无";

    private final AutohomeBrandRepository repository;

    /**
     * Registers one seed per index letter, configures the requester and runs the crawl.
     *
     * <p>Letters {@code E} and {@code U} are skipped (presumably because the site has no brand
     * page for them -- TODO confirm). Every seed is tagged with the type {@code "content"} so
     * that it is routed to {@link #handleContent(Page, CrawlDatums)}.
     *
     * @param crawlPath passed through to the {@link BreadthCrawler} constructor
     * @param autoParse passed through to the {@link BreadthCrawler} constructor
     * @throws Exception if the superclass constructor or the crawl itself fails
     */
    public AutohomeCrawler(String crawlPath, boolean autoParse) throws Exception {
        super(crawlPath, autoParse);
        repository = SpringUtil.getBean(AutohomeBrandRepository.class);
        for (char letter = 'A'; letter <= 'Z'; letter++) {
            if (letter == 'E' || letter == 'U') {
                continue;
            }
            this.addSeed(API.replace(LETTER_PLACEHOLDER, String.valueOf(letter)), "content");
        }
        this.setRequester(new ChromeDriverRequest());
        this.setThreads(1);
        this.start(1);
    }

    /**
     * Intentionally empty: pages are processed by the type-matched handler
     * {@link #handleContent(Page, CrawlDatums)} instead.
     */
    @Override
    public void visit(Page page, CrawlDatums crawlDatums) {

    }

    /**
     * Parses one brand index page and saves an {@link AutohomeBrand} for every model on it.
     *
     * <p>Each {@code body > dl} element is treated as one brand. Inside a brand, the i-th
     * {@code div.h3-tit} (series title) is paired with the i-th {@code ul.rank-list-ul}
     * (model list), so each model is stored exactly once under its own series. If the two
     * counts differ, only the pairs that exist on both sides are processed.
     *
     * @param page        the fetched index page
     * @param crawlDatums follow-up tasks container; not used by this handler
     */
    @MatchType(types = "content")
    public void handleContent(Page page, CrawlDatums crawlDatums) {
        String letterIndex = page.url().replace(URL_PREFIX, "").replace(URL_SUFFIX, "");

        // Every brand listed under this index letter.
        for (Element dl : page.select("body > dl")) {
            String carBrandIconUrl = dl.select("dt > a > img").attr("abs:src");
            Elements carBrandEle = dl.select("dt > div > a");
            String carBrandUrl = carBrandEle.attr("abs:href");
            String carBrand = carBrandEle.text();

            Elements seriesTitles = dl.select("dd div.h3-tit");
            Elements modelLists = dl.select("dd ul.rank-list-ul");
            // Guard against a title without a list (or vice versa) instead of throwing.
            int pairCount = Math.min(seriesTitles.size(), modelLists.size());

            for (int i = 0; i < pairCount; i++) {
                // Series: look the link up inside the title itself, so a title that has no
                // anchor yields empty strings rather than shifting the index alignment.
                Elements seriesLink = seriesTitles.get(i).select("a");
                String carSeriesUrl = seriesLink.attr("abs:href");
                String carSeries = seriesLink.text();

                // Models: only the list that belongs to this series.
                Elements cars = modelLists.get(i).select("li");
                for (int j = 0; j < cars.size(); j++) {
                    // Every sixth <li> is skipped; presumably a visual separator row rather
                    // than a model entry -- TODO confirm against the live markup.
                    if ((j + 1) % 6 == 0) {
                        continue;
                    }
                    Element car = cars.get(j);
                    Elements modelLink = car.select("h4 a");

                    AutohomeBrand autohomeBrand = new AutohomeBrand();

                    autohomeBrand.setCarBrandIconUrl(carBrandIconUrl);
                    autohomeBrand.setCarBrandUrl(carBrandUrl);
                    autohomeBrand.setCarBrand(carBrand);

                    autohomeBrand.setCarSeriesUrl(carSeriesUrl);
                    autohomeBrand.setCarSeries(carSeries);
                    autohomeBrand.setCarModelUrl(modelLink.attr("abs:href"));
                    autohomeBrand.setCarModel(modelLink.text());

                    autohomeBrand.setReferencePrice(extractReferencePrice(car));
                    autohomeBrand.setLetterIndex(letterIndex);
                    repository.save(autohomeBrand);
                }
            }
        }
    }

    /**
     * Reads the reference price from a model entry.
     *
     * @param car the {@code <li>} element of one model
     * @return the price text with the label removed, or the "unavailable" marker when the
     *         entry has no labelled price
     */
    private static String extractReferencePrice(Element car) {
        String priceText = car.select("div:nth-child(2)").text();
        if (StringUtils.isNotBlank(priceText) && priceText.contains(PRICE_LABEL)) {
            return priceText.replace(PRICE_LABEL, "");
        }
        return PRICE_UNAVAILABLE;
    }
}
